/*
 * Saphre - Suffix Arrays for Phrase Extraction
 * Copyright (C) 2013 
 * Dale Gerdemann Tübingen, Germany 
 * Niko Schenk Frankfurt am Main, Germany
 * All rights reserved.
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package collectors.impl;

import java.io.PrintWriter;
import util.sorting.Multiset;
import collectors.api.Collector1;
import util.Interval;
import saphre.core.Store;
import saphre.core.SuffixArray;

/**
 * Class which simulates lcp-interval tree traversal to collect maximal repeats
 * from a corpus.
 *
 * A maximal repeat is a repeat which - extended in one direction to either the
 * left or right - loses at least one of its occurrences. So, for example,
 * assume that "York" occurs 5 times in a corpus. Now, if "New York" also occurs
 * 5 times in the corpus, we say that "York" is NOT (left-)maximal! (because it
 * can be extended to the left by one token without losing any of the original
 * number of occurrences (5). Assume further, that if "York" again appeared 5
 * times, but "New York" only twice in the whole corpus, than "York" would be
 * maximal. (because extending it to the left results in a term frequency loss
 * of 3.
 *
 * Due to the nature of its construction, the lcp-interval tree gives us right
 * maximality for free, i.e. each non-trivial interval (repeat) is
 * right-maximal.
 *
 * Note, that we extended the notion of "maximality" to consider different
 * degrees of maximality. For example, a phrase which occurs 6 times in total
 * and which extends 3 times to the left to form the SAME extended n-gram, is
 * said to be 50% maximal. (cf. Schenk 2013)
 *
 * Run with: show:maximals minTf=200
 *
 *
 * It implements the Collector interface.
 *
 * @author Dale Gerdemann, Niko Schenk
 */
public class MaximalsCollector implements Collector1 {

    Store store;
    SuffixArray sa;
    double D;
    PrintWriter pw = null;
    private int minLengthOfPhrase;
    private int maxLengthOfPhrase;
    private int minTermFrequency;
    private String startsWith;
    private static int maximalcount = 0;
    private static int nonmaximalcount = 0;

    /**
     * Collect maximal phrases.
     *
     * @param store
     * @param sa
     * @param top
     * @param pw
     * @param args
     */
    public MaximalsCollector(Store store, SuffixArray sa, PrintWriter pw, int aMinLengthOfPhrase,
            int aMaxLengthOfPhrase, int aMinTermFrequency, String aStartsWith) {
        this.store = store;
        this.sa = sa;
        //D = (double) store.numDocs();
        // Subtract 2! The program uses two initial and one final sentinel.
        D = (double) store.numDocs() - 2;
        this.pw = pw;

        this.minLengthOfPhrase = aMinLengthOfPhrase;
        this.maxLengthOfPhrase = aMaxLengthOfPhrase;
        this.minTermFrequency = aMinTermFrequency;
        this.startsWith = aStartsWith;
    }

    public void addPrefixArray(SuffixArray pa) {
    }

    /**
     *
     * @param inter
     * @param parentLcp
     * @param lcp
     * @param depth
     */
    @Override
    public void preAdd(Interval inter, int parentLcp, int lcp, int depth) {
    }

    /**
     *
     * @param inter
     * @param lcp
     * @param depth
     * @param docDist
     * @param leftContext
     */
    @Override
    public void add(Interval inter, int lcp, int depth, Multiset docDist,
            Multiset leftContext) {

        Multiset dd = inter.docDist(sa, store);
        int tf = inter.tf();
        int df = dd.size();

        // Limit on the length of phrase.
        if (lcp >= (minLengthOfPhrase)
                && lcp <= maxLengthOfPhrase
                && tf >= minTermFrequency) {

            int loc = sa.getSuftab()[inter.lb()];
            String ngram = store.toString(loc, (loc + lcp));

            int ngramsize = lcp;
            int[] generalngram = new int[ngramsize];
            int ngramindex = 0;
            // Collect this ngram. Every array index is occupied by an individual token/character.
            for (int l = 0; l < ngramsize; l++) {
                generalngram[l] = sa.getText()[loc + ngramindex];
                ngramindex++;
            }

            boolean isMaximal = false;

            boolean leftVariable = false;
            int b = store.text()[sa.getSuftab()[inter.lb()] - 1];
            for (int i = inter.lb(); i <= inter.rb(); i++) {
                int oneBefore = store.text()[sa.getSuftab()[i] - 1];
                if (oneBefore != b) {
                    leftVariable = true;
                }
            }

            if (leftVariable) {
                isMaximal = true;
                maximalcount++;
            } else {
                nonmaximalcount++;
            }

            if (isMaximal && ngram.startsWith(startsWith)) {
                String rval = ngram + "\t" + isMaximal + "\t" + ngramsize + "\t" + tf + "\t" + df + "\n";
                if(pw!=null) {
                    pw.print(rval);
                }
                System.out.print(rval);
            }
        }
    }

    /**
     *
     * @param inter
     * @param lcp
     * @param depth
     * @param docDist
     * @param leftContext
     */
    @Override
    public void addTrivial(Interval inter, int lcp, int depth,
            Multiset docDist, Multiset leftContext) {
    }

    /**
     *
     * @return
     */
    public static int getMaximalcount() {
        return maximalcount;
    }

    /**
     *
     * @return
     */
    public static int getNonMaximalcount() {
        return nonmaximalcount;
    }
}